Methodology
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# ---- Load the data and run a first-pass exploration ----
data = pd.read_csv("vehicle.csv")
data.head() #checking the head
data.shape #checking the shape of the data
data.describe().T  # summary statistics, transposed so each feature is a row
data.dtypes  # column dtypes -- 'class' is expected to be an object column
data.skew()  # skewness of each column (flags asymmetric distributions)
# Checking for missing data
data.isnull().sum()
Most of the variables have missing data; the exceptions are compactness, max.length_aspect_ratio, max.length_rectangularity, hollows_ratio, and class.
# checking the median of the data
# numeric_only=True is required: the 'class' column is a string column and
# DataFrame.median() raises a TypeError on it in pandas >= 2.0 (older pandas
# silently dropped it).
data.median(numeric_only=True)
# dealing with missing data
# BUG FIX: `newData = data` only bound a second name to the SAME DataFrame,
# so the fill below mutated the original `data` as well, defeating the
# stated intent of keeping the original constant. An explicit copy fixes it.
newData = data.copy()
# define a median filler: replace NaNs in a column with that column's median
medianFiller = lambda x: x.fillna(x.median())
# apply the filler to the 18 numeric feature columns (all except 'class')
newData.iloc[:, 0:18] = newData.iloc[:, 0:18].apply(medianFiller)
# checking for missing data in the new dataframe
newData.isnull().sum()
All the missing values have been replaced with column medians; the new data frame now contains no missing values.
# Class distribution: absolute counts first, then each class's percentage share.
newData['class'].value_counts()
newData['class'].value_counts(normalize=True) * 100
Since `class` is currently an object (string) column, it is difficult to examine the distribution of the other columns with respect to it, so it must be converted to a numeric type using an encoder.
newData.dtypes # checking the datatype before encoding
# using a Label Encoder to map the string class labels to integer codes
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# BUG FIX: `data_le = newData` aliased the same DataFrame, so the encoding
# below silently overwrote newData['class'] too. Copy first so the encoded
# frame is genuinely a separate object, as the comment implies.
data_le = newData.copy() # encoded df
data_le['class'] = le.fit_transform(data_le['class'])
data_le.dtypes # checking the df after encoding
data_le['class'].value_counts()
# pairwise scatter plots of every feature pair, coloured by the encoded class
sns.pairplot(data_le, hue = 'class')
Listing out only a few observations
# Checking for Correlation between the (now all-numeric) columns
corr = data_le.corr()
# BUG FIX: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24, so
# this line crashed on modern NumPy; the builtin `bool` is the correct dtype.
mask = np.zeros_like(corr, dtype=bool)
# mask the upper triangle -- it mirrors the lower triangle
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize = (15, 15))
cmap = sns.diverging_palette(220, 10, as_cmap = True)
sns.heatmap(corr, mask = mask, cmap = cmap, vmax = 1, center = 0, square = True,
linewidths = .5, cbar_kws = {"shrink": .5}, annot = True)
So it is observed there is a fair bit of correlation among the features and this will drive the inclusion of most of them to build a base model.
# preparing data - dropping only the class column
from sklearn.model_selection import train_test_split
X = data_le.drop(columns = ['class'])  # predictor feature columns
Y = data_le['class']                   # target class labels
# 70/30 train/test split with a fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=25)
# check the data split proportions
n_rows = len(data_le.index)
print(f"{len(x_train) / n_rows * 100:0.2f}% data is in training set")
print(f"{len(x_test) / n_rows * 100:0.2f}% data is in test set")
#SVM - non scaled version (baseline on raw, unscaled features)
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score
SVM = svm.SVC(random_state=25)
SVM.fit(x_train, y_train)
SVM_train_predict = SVM.predict(x_train)
SVM_test_predict = SVM.predict(x_test)
print('SVM accuracy for train set: {0:.3f}'.format(SVM.score(x_train, y_train)))
print('SVM accuracy for test set: {0:.3f}'.format(SVM.score(x_test, y_test)))
# Classification Report (per-class precision / recall / F1 on the test set)
print('\n{}'.format(classification_report(y_test, SVM_test_predict)))
# Accuracy Score
# FIX: this variable was named `auc`, which misleadingly suggested an
# ROC-AUC value -- it actually holds plain accuracy.
accuracy = accuracy_score(y_test, SVM_test_predict)
print('\nAccuracy Score:', accuracy.round(3))
# Performing Cross Validation over the full data set (sklearn default folds)
from sklearn.model_selection import cross_val_score
svm_cvs = cross_val_score(svm.SVC(random_state=25), X, Y)
svm_cvs_mean = svm_cvs.mean()
svm_cvs_mean  # mean cross-validated accuracy
# preparing data - dropping columns which don't appear to influence the class
# (the three skewness features and one radius-of-gyration feature)
Xnew = data_le.drop(columns = ['class', 'skewness_about', 'skewness_about.1', 'skewness_about.2', 'scaled_radius_of_gyration.1']) # Predictor feature columns
Ynew = data_le['class'] # Predicted class
# preparing the training and testing data set (same 70/30 split and seed as before)
x_trainN, x_testN, y_trainN, y_testN = train_test_split(Xnew, Ynew, test_size=0.3, random_state=25)
# check the data split
print("{0:0.2f}% data is in training set".format((len(x_trainN)/len(data_le.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(x_testN)/len(data_le.index)) * 100))
# SVM - non scaled version on the reduced feature set
# (svm and the metrics were already imported above; the duplicate imports
# that used to be here were redundant and have been removed)
SVMN = svm.SVC(random_state=25)
SVMN.fit(x_trainN, y_trainN)
SVMN_train_predict = SVMN.predict(x_trainN)
SVMN_test_predict = SVMN.predict(x_testN)
print('SVM accuracy for train set: {0:.3f}'.format(SVMN.score(x_trainN, y_trainN)))
print('SVM accuracy for test set: {0:.3f}'.format(SVMN.score(x_testN, y_testN)))
# Classification Report
print('\n{}'.format(classification_report(y_testN, SVMN_test_predict)))
# Accuracy Score
# FIX: renamed from `aucNew` -- the value is accuracy, not ROC-AUC.
accuracyN = accuracy_score(y_testN, SVMN_test_predict)
print('\nAccuracy Score:', accuracyN.round(3))
# Performing Cross Validation on the reduced feature set
svmN_cvs = cross_val_score(svm.SVC(random_state=25), Xnew, Ynew)
svmN_cvs_mean = svmN_cvs.mean()
svmN_cvs_mean  # mean cross-validated accuracy
There is no significant improvement in the accuracy of the model after dropping more features.
# PCA on the (unscaled) full feature matrix, retaining every component
# so the full variance profile can be inspected.
from sklearn.decomposition import PCA
pca = PCA(n_components=18)  # no. of components = no. of features
pca.fit(X)
print(pca.explained_variance_ratio_)  # variance share captured per component
component_ids = list(range(1, 19))
# scree plot: variance explained by each individual component
plt.bar(component_ids, pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('Eigen Value')
plt.show()
# cumulative variance explained across components
plt.step(component_ids, np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()
It can be observed that the first component accounts for more than 95% of the variance, and hence only one component will be considered further for building the model.
# Keep only the first principal component, which dominates the variance.
pca = PCA(n_components=1)
pca.fit(X)
Xpca = pca.transform(X)  # project the features onto that single component
# fit an SVM on the one-dimensional projection
model_pca = svm.SVC(random_state=25)
model_pca.fit(Xpca, Y)
model_pca.score(Xpca, Y)  # accuracy measured on the same data the model was fit on
# scaling the features: standardise every column to zero mean / unit variance
from scipy.stats import zscore
XScaled = X.apply(zscore, axis=0)
XScaled.head()  # quick sanity check of the scaled values
# preparing the training and testing data set for the scaled features
# (same 70/30 split and seed as the unscaled runs, for a fair comparison)
x_trainS, x_testS, y_trainS, y_testS = train_test_split(XScaled, Y, test_size=0.3, random_state=25)
# check the data split
print("{0:0.2f}% data is in training set".format((len(x_trainS)/len(data_le.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(x_testS)/len(data_le.index)) * 100))
# SVM - scaled version
SVMScaled = svm.SVC(random_state=25)
SVMScaled.fit(x_trainS, y_trainS)
SVM_train_predictS = SVMScaled.predict(x_trainS)
SVM_test_predictS = SVMScaled.predict(x_testS)
print('SVM accuracy for train set: {0:.3f}'.format(SVMScaled.score(x_trainS, y_trainS)))
print('SVM accuracy for test set: {0:.3f}'.format(SVMScaled.score(x_testS, y_testS)))
# Classification Report
print('\n{}'.format(classification_report(y_testS, SVM_test_predictS)))
# Accuracy Score
# FIX: renamed from `aucS` -- this value is accuracy, not ROC-AUC.
accuracyS = accuracy_score(y_testS, SVM_test_predictS)
print('\nAccuracy Score:', accuracyS.round(3))
# Performing Cross Validation on the scaled features
svmS_cvs = cross_val_score(svm.SVC(random_state=25), XScaled, Y)
svmS_cvs_mean = svmS_cvs.mean()
svmS_cvs_mean  # mean cross-validated accuracy
# PCA on the scaled features, all 18 components, to inspect the variance profile
pcaS = PCA(n_components=18)
pcaS.fit(XScaled)
scaled_component_ids = list(range(1, 19))
# scree plot of per-component variance
plt.bar(scaled_component_ids, pcaS.explained_variance_ratio_, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()
# cumulative variance explained
plt.step(scaled_component_ids, np.cumsum(pcaS.explained_variance_ratio_), where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()
# Considering 6 components which represent close to 95% of the variance
pcaS = PCA(n_components=6)
pcaS.fit(XScaled)
print(pcaS.explained_variance_ratio_)  # variance share of each retained component
XpcaS = pcaS.transform(XScaled)  # project the scaled features onto 6 components
# fit an SVM on the 6-dimensional representation
model_pcaS = svm.SVC(random_state=25)
model_pcaS.fit(XpcaS, Y)
model_pcaS.score(XpcaS, Y)  # accuracy on the same data the model was fit on
Checking the accuracies on the Test data set
By dropping the dimensions to 1 for the non-scaled model, the accuracy fell by about 5%, while for the scaled features dropping the dimensions to 6 caused a drop of around 6% in accuracy. The scaled model performed well, achieving a good test accuracy of around 90% with PCA using the SVM model.